In [1]:
from wikidataintegrator import wdi_core, wdi_login
from wikidataintegrator.ref_handlers import update_retrieved_if_new_multiple_refs
import pandas as pd
from pandas import read_csv
import requests
from tqdm.notebook import trange, tqdm
import ipywidgets
import widgetsnbextension
import time
In [57]:
print("retrieving API credentials")
import wdi_user_config
api_dict = wdi_user_config.get_gard_credentials()
header_info = {api_dict['name']: api_dict['value']}
In [59]:
print("Logging in...")
import wdi_user_config ## Credentials stored in a wdi_user_config file
login_dict = wdi_user_config.get_credentials()
login = wdi_login.WDLogin(login_dict['WDUSER'], login_dict['WDPASS'])
In [58]:
gard_results = requests.get('https://api.rarediseases.info.nih.gov/api/diseases',
                            headers=header_info)
print(gard_results)
In [59]:
gard_df = pd.read_json(gard_results.text)
print(gard_df.head(n=2))
Although we could easily pull the synonyms from this dataframe and upload them to Wikidata, we only have permission to upload data specifically generated by GARD. Hence we will need to visit each disease's page in the GARD API to check the source of each synonym. While we're at it, we can also pull alternate identifiers, which will NOT be loaded to Wikidata but can be used for mapping. Since the Mix N Match community has already done a lot of GARD ID mapping, we will only need these alternative identifiers for items that don't yet have GARD IDs mapped.
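A minimal sketch of that per-disease check, using the same endpoint and JSON keys as the cells below (the helper name, the placeholder arguments, and the rule that a synonym with no listed source counts as GARD-generated are assumptions for illustration):
In [ ]:
## Sketch: fetch one disease record and keep only the synonyms GARD itself generated
## (entries with no "source" field). Helper name and usage are illustrative only.
def gard_only_synonyms(disease_id):
    url = 'https://api.rarediseases.info.nih.gov/api/diseases/' + str(disease_id)
    record = requests.get(url, headers=header_info).json().get("mainPropery", {})
    synonyms = record.get('synonyms-with-source') or []
    ## A missing "source" field is treated as "generated by GARD", matching the fillna("GARD") step below
    return [syn for syn in synonyms if not syn.get('source')]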
In [82]:
## The resulting JSON has a key "mainPropery" which is where our desired data is stored.
## Since it looks like a misspelling, we'll store that key in a variable so that it will be
## easy to update if the API ever corrects the key name.
key_of_interest = "mainPropery"
In [68]:
"""
## Unit test: Request and parse a sample page
i=1
fail_list = []
sample_result = requests.get('https://api.rarediseases.info.nih.gov/api/diseases/'+str(gard_df.iloc[i]['diseaseId']),
                             headers=header_info)
json_result = sample_result.json()
data_of_interest = json_result.get(key_of_interest)
## Check if there are synonyms that don't have a source (i.e., they come from GARD itself)
sourced_syn = data_of_interest.get('synonyms-with-source')
identifier_results = data_of_interest.get('identifiers')
tmpdict = pd.DataFrame(sourced_syn).fillna("GARD")
tmpdict['diseaseId'] = gard_df.iloc[i]['diseaseId']
print(tmpdict)
## Check if there are identifiers that can be used for xrefs
identifier_dict = pd.DataFrame(identifier_results).fillna("None")
print(identifier_dict)
"""
In [88]:
gard_id_list = gard_df['diseaseId'].unique().tolist()
#gard_id_list = [13018,5658,10095] ## Iteration test
fail_list = []
no_syns = []
no_idens = []
identifier_df = pd.DataFrame(columns=['diseaseId','identifierId','identifierType'])
synonyms_df = pd.DataFrame(columns=['diseaseId','name','source'])
for i in tqdm(range(len(gard_id_list))):
    disease_id = gard_id_list[i]
    try:
        sample_result = requests.get('https://api.rarediseases.info.nih.gov/api/diseases/'+str(disease_id),
                                     headers=header_info)
        json_result = sample_result.json()
        data_of_interest = json_result.get(key_of_interest)
        ## Check if there are synonyms that don't have a source (i.e., they come from GARD itself)
        sourced_syn = data_of_interest.get('synonyms-with-source')
        tmpdict = pd.DataFrame(sourced_syn).fillna("GARD")
        tmpdict['diseaseId'] = disease_id
        if len(tmpdict) == 0:
            no_syns.append(disease_id)
        else:
            synonyms_df = pd.concat((synonyms_df,tmpdict),ignore_index=True)
        ## Check if there are identifiers that can be used for xrefs
        identifier_results = data_of_interest.get('identifiers')
        identifier_dict = pd.DataFrame(identifier_results).fillna("None")
        identifier_dict['diseaseId'] = disease_id
        if len(identifier_dict) == 0:
            no_idens.append(disease_id)
        else:
            identifier_df = pd.concat((identifier_df,identifier_dict),ignore_index=True)
    except Exception:
        fail_list.append(disease_id)
print("Identifiers found: ", len(identifier_df))
print("Synonyms found: ", len(synonyms_df))
print("Requests failed: ",len(fail_list))
print("GARD IDs with no synonyms: ", len(no_syns))
print("GARD IDs with no xrefs: ", len(no_idens))
In [90]:
## Export results to avoid having to hit the API again
identifier_df.to_csv('data/identifier_df.tsv',sep='\t',header=True)
synonyms_df.to_csv('data/synonyms_df.tsv',sep='\t',header=True)
with open('data/no_syns.txt','w') as outwrite:
    for eachentry in no_syns:
        outwrite.write(str(eachentry)+'\n')
with open('data/no_idens.txt','w') as idenwrite:
    for eachiden in no_idens:
        idenwrite.write(str(eachiden)+'\n')
In [76]:
print(identifier_df)
In [2]:
identifier_df = read_csv('data/identifier_df.tsv',delimiter='\t',header=0,index_col=0)
synonyms_df = read_csv('data/synonyms_df.tsv',delimiter='\t',header=0,index_col=0, encoding='latin-1')
no_syns=[]
with open('data/no_syns.txt','r') as syn_read:
    for line in syn_read:
        no_syns.append(line.strip('\n'))
no_idens=[]
with open('data/no_idens.txt','r') as iden_read:
    for line in iden_read:
        no_idens.append(line.strip('\n'))
In [3]:
# Retrieve all QIDs with GARD IDs
sparqlQuery = "SELECT * WHERE {?item wdt:P4317 ?GARD}"
result = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery)
In [4]:
gard_in_wd_list = []
for i in tqdm(range(len(result["results"]["bindings"]))):
    gard_id = result["results"]["bindings"][i]["GARD"]["value"]
    wdid = result["results"]["bindings"][i]["item"]["value"].replace("http://www.wikidata.org/entity/", "")
    gard_in_wd_list.append({'WDID':wdid,'diseaseId':gard_id})
gard_in_wd = pd.DataFrame(gard_in_wd_list)
print(gard_in_wd.head(n=3))
Currently, there is no bot adding GARD IDs to Wikidata entities, so the GARD IDs already in Wikidata were added via Mix N Match. Next, we identify the GARD diseases not yet in Wikidata and determine whether they can be mapped using one of the other identifiers available via GARD (e.g., Orphanet).
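For instance, a disease whose GARD ID is not yet in Wikidata can sometimes be located through its Orphanet number (P1550). A minimal sketch of that lookup, using the same SPARQL helper as above (the Orphanet value shown is only a placeholder, not taken from the GARD data):
In [ ]:
## Sketch: find the QID for an item via an alternate identifier, here an Orphanet number (P1550).
## The value "558" is a placeholder to illustrate the query shape.
example_query = 'SELECT ?item WHERE {?item wdt:P1550 "558"}'
example_result = wdi_core.WDItemEngine.execute_sparql_query(example_query)
for binding in example_result["results"]["bindings"]:
    print(binding["item"]["value"].replace("http://www.wikidata.org/entity/", ""))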
In [14]:
gard_in_wd_id_list = gard_in_wd['diseaseId'].unique().tolist()
gard_not_in_wd = identifier_df.loc[~identifier_df['diseaseId'].isin(gard_in_wd_id_list)]
print(len(gard_not_in_wd))
print(len(gard_not_in_wd['diseaseId'].unique().tolist()))
print(gard_not_in_wd.head(n=2))
property_list = gard_not_in_wd['identifierType'].unique().tolist()
print(property_list)
In [16]:
prop_id_dict = {'OMIM':'P492', 'ORPHANET':'P1550', 'UMLS':'P2892',
                'SNOMED CT':'P5806', 'ICD 10':'P494', 'NCI Thesaurus':'P1748',
                'ICD 10-CM':'P4229', 'MeSH':'P486'}
print(prop_id_dict['OMIM'])
In [26]:
sparql_start = 'SELECT * WHERE {?item wdt:'
sparql_end = '}'
identifier_megalist=[]
for eachidtype in property_list:
    sparqlQuery = sparql_start + prop_id_dict[eachidtype] + ' ?identifierId' + sparql_end
    result = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery)
    for i in tqdm(range(len(result["results"]["bindings"]))):
        id_id = result["results"]["bindings"][i]['identifierId']["value"]
        wdid = result["results"]["bindings"][i]["item"]["value"].replace("http://www.wikidata.org/entity/", "")
        identifier_megalist.append({'WDID':wdid,'identifierId':id_id, 'identifierType':eachidtype})
    print(len(identifier_megalist))
    time.sleep(2)
identifier_megadf = pd.DataFrame(identifier_megalist)
identifier_megadf.to_csv('data/identifier_megadf.tsv',sep='\t',header=True)
In [61]:
## For each GARD disease entry, check for multiple mappings to the same WDID
missing_gard_merge = gard_not_in_wd.merge(identifier_megadf,on=(['identifierId', 'identifierType']), how="inner")
still_missing = gard_not_in_wd.loc[~gard_not_in_wd['diseaseId'].isin(missing_gard_merge['diseaseId'].unique().tolist())]
print("Disease IDs for which identifiers couldn't be used to find a QID: ",len(still_missing))
## Determine the number of identifiers that support a merge
potential_gard = missing_gard_merge.groupby(['diseaseId','WDID']).size().reset_index(name='identifier_count')
mapping_check1 = potential_gard.groupby('diseaseId').size().reset_index(name='qid_count')
one_to_many = mapping_check1.loc[mapping_check1['qid_count']>1]
#print(len(one_to_many))
mapping_check2 = potential_gard.groupby('WDID').size().reset_index(name='gardid_count')
many_to_one = mapping_check2.loc[mapping_check2['gardid_count']>1]
#print(len(many_to_one))
gard_mapping_issue_ids = one_to_many['diseaseId'].unique().tolist() + many_to_one['WDID'].unique().tolist()
gard_to_add = potential_gard.loc[~potential_gard['diseaseId'].isin(gard_mapping_issue_ids) &
                                 ~potential_gard['WDID'].isin(gard_mapping_issue_ids) &
                                 ~potential_gard['diseaseId'].isin(still_missing['diseaseId'].unique().tolist())]
gard_to_add_full = gard_to_add.merge(gard_df,on='diseaseId',how="left")
gard_to_auto_add = gard_to_add_full.loc[gard_to_add_full['identifier_count']>1]
gard_to_suggest = gard_to_add_full.loc[gard_to_add_full['identifier_count']==1]
print(gard_to_auto_add.head(n=2))
After removing GARD entries with no alternative identifier by which they can be mapped, GARD entries that map to multiple Wikidata entities, and multiple GARD entries that map to a single Wikidata entity (all based on the other identifiers GARD provides for each entry), we're left with entries we can either add automatically or suggest. Entries that map to a single WDID based on MULTIPLE identifier mappings can be added by script. Entries that map to a single WDID based on a single identifier would probably be best sent to Mix N Match to avoid complaints further down the line.
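The single-identifier candidates aren't discarded; a minimal sketch of exporting them for manual review or a Mix N Match suggestion list (the output path and column selection are assumptions, not part of the original workflow):
In [ ]:
## Sketch: export the single-identifier suggestions for manual review / Mix N Match.
## File name and chosen columns are assumptions for illustration only.
gard_to_suggest[['diseaseId','WDID','identifier_count','websiteUrl']].to_csv(
    'data/gard_to_suggest.tsv', sep='\t', index=False)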
In [62]:
# GARD rare disease ID P4317
from datetime import datetime
import copy
def create_reference(gard_url):
    refStatedIn = wdi_core.WDItemID(value="Q47517289", prop_nr="P248", is_reference=True)
    timeStringNow = datetime.now().strftime("+%Y-%m-%dT00:00:00Z")
    refRetrieved = wdi_core.WDTime(timeStringNow, prop_nr="P813", is_reference=True)
    refURL = wdi_core.WDUrl(value=gard_url, prop_nr="P854", is_reference=True)
    return [refStatedIn, refRetrieved, refURL]
In [68]:
## Unit test -- write a statement
gard_qid = gard_to_auto_add.iloc[1]['WDID']
gard_url = gard_to_auto_add.iloc[1]['websiteUrl']
gard_id = str(gard_to_auto_add.iloc[1]['diseaseId'])
reference = create_reference(gard_url)
gard_prop = "P4317"
statement = [wdi_core.WDString(value=gard_id, prop_nr=gard_prop, references=[copy.deepcopy(reference)])]
item = wdi_core.WDItemEngine(wd_item_id=gard_qid, data=statement, append_value=gard_prop,
                             global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
item.write(login)
edit_id = item.lastrevid
print(gard_id, gard_qid, gard_url)
In [70]:
## Test write with 10 items completed successfully
gard_map_revision_list = []
for i in tqdm(range(len(gard_to_auto_add))):
    gard_qid = gard_to_auto_add.iloc[i]['WDID']
    gard_url = gard_to_auto_add.iloc[i]['websiteUrl']
    gard_id = str(gard_to_auto_add.iloc[i]['diseaseId'])
    reference = create_reference(gard_url)
    gard_prop = "P4317"
    statement = [wdi_core.WDString(value=gard_id, prop_nr=gard_prop, references=[copy.deepcopy(reference)])]
    item = wdi_core.WDItemEngine(wd_item_id=gard_qid, data=statement, append_value=gard_prop,
                                 global_ref_mode='CUSTOM', ref_handler=update_retrieved_if_new_multiple_refs)
    item.write(login, edit_summary='added GARD ID')
    gard_map_revision_list.append(item.lastrevid)
In [ ]:
## Export the revision list
with open('data/mapping_revisions.txt','w') as outwritelog:
    for eachrevid in gard_map_revision_list:
        outwritelog.write(str(eachrevid)+'\n')
In [10]:
## pull aliases for all entries with GARD IDs
sparqlQuery = 'SELECT ?item ?itemLabel ?GARD ?alias WHERE {?item wdt:P4317 ?GARD. OPTIONAL {?item skos:altLabel ?alias FILTER (LANG (?alias) = "en").} SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}'
result = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery)
In [37]:
## Format the results from the Wikidata query into Pandas DF for easier manipulation
gard_alias_in_wd_list = []
for i in tqdm(range(len(result["results"]["bindings"]))):
    gard_id = result["results"]["bindings"][i]["GARD"]["value"]
    wdid = result["results"]["bindings"][i]["item"]["value"].replace("http://www.wikidata.org/entity/", "")
    label = result["results"]["bindings"][i]["itemLabel"]["value"]
    try:
        alias = result["results"]["bindings"][i]["alias"]["value"]
    except KeyError:
        alias = "No alias"
    ## Note that Wikidata stores GARD IDs as strings, while GARD stores them as integers; convert to ensure matchability
    gard_alias_in_wd_list.append({'WDID':wdid,'diseaseId':int(gard_id),'label':label,'alias':alias})
gard_alias_in_wd = pd.DataFrame(gard_alias_in_wd_list)
print(gard_alias_in_wd.head(n=3))
In [55]:
## Pull the aliases that are sourced from GARD
gard_alias = synonyms_df.loc[synonyms_df['source']=='GARD']
## Filter the Wikidata GARD alias table down to just the GARD IDs present in the GARD alias DF (i.e., those with allowable synonyms)
gard_wd_limited_df = gard_alias_in_wd.loc[gard_alias_in_wd['diseaseId'].isin(gard_alias['diseaseId'].unique().tolist())]
alias_check_df = gard_alias.merge(gard_wd_limited_df,on='diseaseId',how='inner').copy()
## Check if the GARD synonym matches anything in the corresponding Wikidata label or alias
alias_check_df['label_match?'] = alias_check_df['name'].str.lower()==alias_check_df['label'].str.lower()
alias_check_df['alias_match?'] = alias_check_df['name'].str.lower()==alias_check_df['alias'].str.lower()
## Identify the GARD synonyms that were found in Wikidata (label or aliases) for removal
synonyms_to_drop = alias_check_df['name'].loc[(alias_check_df['label_match?']==True) |
                                              (alias_check_df['alias_match?']==True)].unique().tolist()
## Filter out GARD entries that were found in Wikidata
synonyms_to_inspect = alias_check_df.loc[~alias_check_df['name'].isin(synonyms_to_drop)]
## Identify the synonyms to add to wikidata as an alias
synonyms_to_add = synonyms_to_inspect.drop_duplicates(subset=['diseaseId','name','source','WDID','label'], keep='first')
print(synonyms_to_add.head(n=4))
print(len(synonyms_to_add))
In [63]:
disease_qid = synonyms_to_add.iloc[0]['WDID']
disease_alias = synonyms_to_add.iloc[0]['name']
In [57]:
print(disease_qid,disease_alias)
In [73]:
## Unit test -- write a statement
wikidata_item = wdi_core.WDItemEngine(wd_item_id=disease_qid)
wikidata_item.set_aliases([disease_alias],lang='en',append=True)
wikidata_item.write(login, edit_summary='added alias from GARD')
print(wikidata_item.get_aliases(lang='en'))
print(wikidata_item.lastrevid)
#wikidata_item.get_aliases(lang='en')
In [ ]:
## Script to run the synonym updates
gard_alias_revision_list = []
for i in tqdm(range(len(synonyms_to_add))):
    disease_qid = synonyms_to_add.iloc[i]['WDID']
    disease_alias = synonyms_to_add.iloc[i]['name']
    wikidata_item = wdi_core.WDItemEngine(wd_item_id=disease_qid)
    wikidata_item.set_aliases([disease_alias],lang='en',append=True)
    wikidata_item.write(login, edit_summary='added alias from GARD')
    gard_alias_revision_list.append(wikidata_item.lastrevid)
In [ ]:
## Export the revision list
with open('data/alias_revisions.txt','w') as aliaslog:
    for eachrevid in gard_alias_revision_list:
        aliaslog.write(str(eachrevid)+'\n')